################################
## Script: 09_pull_together_data.R
## Purpose: This code pulls together the final data for the
## gpt4o annotations. It also creates coding sets for
## the gpt4o precision test.  
## Data In:
## 1) Zero shot GPT4o annotations
## all files with the form
## data/gpt4o_annotations/annotate_[num].rds
## where "num" goes from 1-100
## 2) Zero shot GPT4o annotations, alternative
## prompt
## all files with the form
## data/gpt4o_annotations_rit/annotate_rit_alt_[num].rds
## where "num" goes from 1-100
## 3) Fine tuned LLM annotations
## all files with the form 
## data/gpt4o_finetune_annotations/annotate_[num].rds
## where "num" goes from 1-100
## 4) training pairs for fine tuning
## data/train_fine_tune_ids.rds
## Data Out:
## 1) Candidate pairs labelled by zero shot LLM
## data/gpt_annotations_full.rds 
## 2) Candidate pairs labelled by strict zero shot LLM
## data/gpt_annotations_full_strict_rit.rds
## 3) Candidate pairs labelled by fine tuned LLM
## data/gpt_annotations_finetune_full.rds
## 4) Precision coding sets for zero shot 
## annotator:
## a) master file: data/precision_gpt4o_update_test_master.rds
## b) Set 1: data/precision_gpt4o_update_test_ra1.csv
## c) Set 2: data/precision_gpt4o_update_test_ra2.csv
## 5) Precision coding sets for Zero shot annotator,
## strict prompts:
## a) Master file: data/precision_gpt4o_strict_rit_test_master.rds
## b) Set 1: data/precision_gpt4o_strict_test_ra1.csv
## c) Set 2: data/precision_gpt4o_strict_test_ra2.csv
## 6) Precision coding sets for fine tuned annotator
## a) Master file: data/precision_gpt4o_finetune_master.rds
## b) Set 1: data/precision_gpt4o_finetune_ra1.csv
## c) Set 2: data/precision_gpt4o_finetune_ra2.csv
## d) Set 3: data/precision_gpt4o_finetune_ra3.csv
## Notes:

library(openai)
library(dplyr)
library(lubridate)

## "not in" operator: x %ni% y is TRUE for each element of x that has
## no match in y (the negation of %in%)
`%ni%` <- Negate(`%in%`)

#Sys.setenv(
#  OPENAI_API_KEY = ''
#)

########################################
### Read in Data: Zero Shot ############
########################################

## List the zero shot annotation files, dropping any file whose
## name contains "lists"
annotations <- list.files("data/gpt4o_annotations/")
annotations <- annotations[!grepl("lists", annotations)]
## Strip the "annotate_" prefix and ".rds" suffix to recover the file number
annotations_num <- gsub("annotate_|\\.rds", "", annotations)

## Number of indices in 1-100 that have no matching file
sum(1:100 %ni% as.numeric(annotations_num)) ## none missing
## should be 100 files

annotations <- paste0("data/gpt4o_annotations/",
                      annotations)

## Read each file into a preallocated list (in the order returned by
## list.files()) and stack the pieces into one data frame
data <- vector("list", length(annotations))
for (i in seq_along(annotations)) {
  data[[i]] <- readRDS(annotations[i])
}
data <- bind_rows(data)

#########################################
### Re-run Same Claim (Zero Shot) #######
#########################################

## Here we rerun cases where the API
## errored out. We had a lot more cases 
## of this when we ran these annotations than 
## for all other annotations. 

## Note to replicators:
## you don't need to run this code because
## we have saved a derivative product below.

## Instruction text for the same-subject comparison. It is pasted in front
## of the two subject lists when the API prompt is built. The line break
## inside the literal is part of the string.
prompt_compare_subject <- "You will be provided with the lists of the people, places, objects, and events discussed in two paragraphs. Based on these
lists, do the two paragraphs discuss the vast majority of the same people, places, objects, and events?"
## Instruction text for the same-claim comparison. It is pasted in front
## of the two claim lists when the API prompt is built. The line break
## inside the literal is part of the string.
prompt_compare_claim <- "You will be provided with the lists of descriptive, normative, conceptual, and causal claims discussed in two paragraphs. 
Based on these lists, do the two paragraphs discuss the vast majority of the same claims?" 

## re-running same claim: rows whose same-claim label is still missing
torun <- which(is.na(data$gpt4o_same_claim_2))

## seq_along() makes this a no-op when nothing is missing; 1:length(torun)
## would iterate over c(1, 0) and send prompts built from NA to the API
for (j in seq_along(torun)) {
  ## prompt = instructions + the claim lists of the two paragraphs
  prompt_c <- paste0(prompt_compare_claim, "\n\n\n # **List for Paragraph 1**: ",
                     data$ego_claim[torun[j]], "\n\n\n # **List for Paragraph 2**: ",
                     data$alter_claim[torun[j]], "\n\n\n # **Your Label (Respond only 'YES' or 'NO')**: ")
  m <- create_chat_completion(model = "gpt-4o-2024-05-13",
                              messages = list(list(role = "user",
                                                   content = prompt_c)),
                              temperature = 0)
  ## store the model's reply as the label for this row
  data$gpt4o_same_claim_2[torun[j]] <- m$choices$message.content
  ## progress: report this loop's own index
  print(j)
}

## re-running missing same subject
## for cases which were labelled YES
sum(is.na(data$gpt4o_same_subject_2)) ## 8,563
table(data$gpt4o_same_claim_2)

## rows with no same-subject label whose same-claim label is "YES"
torun2 <- which(is.na(data$gpt4o_same_subject_2) &
                  data$gpt4o_same_claim_2 == "YES")
## not re-running for NA values where claim
## was "NO"

## seq_along() makes this a no-op when nothing needs re-running;
## 1:length(torun2) would iterate over c(1, 0) and call the API with
## prompts built from NA
for (j in seq_along(torun2)) {
  ## prompt = instructions + the subject lists of the two paragraphs
  prompt_s <- paste0(prompt_compare_subject, "\n\n\n # **List for Paragraph 1**: ",
                     data$ego_subject[torun2[j]], "\n\n\n # **List for Paragraph 2**: ",
                     data$alter_subject[torun2[j]], "\n\n\n # **Your Label (Respond only 'YES' or 'NO')**: ")

  m <- create_chat_completion(model = "gpt-4o-2024-05-13",
                              messages = list(list(role = "user",
                                                   content = prompt_s)),
                              temperature = 0)
  ## store the model's reply as the label for this row
  data$gpt4o_same_subject_2[torun2[j]] <- m$choices$message.content
  print(j) ## progress
}


## Final zero shot label: "YES" only when both the same-claim and the
## same-subject comparisons are "YES"
data$gpt_annotation <- with(
  data,
  ifelse(gpt4o_same_claim_2 == "YES" & gpt4o_same_subject_2 == "YES",
         "YES", "NO")
)
table(data$gpt_annotation) ## 18,138 predicted positive 


#saveRDS(data, "data/gpt_annotations_full.rds")

## Load the saved copy of the labelled pairs; this replaces the
## object built above
data <- readRDS("data/gpt_annotations_full.rds")


#############################################
### Read in Data: Strict Version ############
#############################################

## Note: these are annotations from a previous
## version of our zero shot llm annotator. We don't reference
## this in the paper but we include it in the cleaning code
## here because we included labeled output from this
## annotator in the dataset used for fine tuning. 

## List the strict-prompt annotation files and strip the
## "annotate_rit_alt_" prefix and ".rds" suffix to recover the file number
annotations_alt <- list.files("data/gpt4o_annotations_rit/")
annotations_num <- gsub("annotate_rit_alt_|\\.rds", "", annotations_alt)

## Number of indices in 1-100 that have no matching file
sum(1:100 %ni% as.numeric(annotations_num)) ## none missing
## should be 100 files

annotations_alt <- paste0("data/gpt4o_annotations_rit/",
                          annotations_alt)

## Read each file into a preallocated list (in the order returned by
## list.files()) and stack the pieces into one data frame
data_v2 <- vector("list", length(annotations_alt))
for (i in seq_along(annotations_alt)) {
  data_v2[[i]] <- readRDS(annotations_alt[i])
}
data_v2 <- bind_rows(data_v2)

## Strict label: "YES" only when same subject is "YES" and same claim is a
## non-missing "YES" (%in% is FALSE for NA, so a missing claim gives "NO")
data_v2$gpt_annotation_strict <- with(
  data_v2,
  ifelse(same_subject == "YES" & same_claim %in% "YES", "YES", "NO")
)
table(data_v2$gpt_annotation_strict) ## 10,648 predicted positive

## Missingness checks: same-subject labels overall, and same-claim labels
## among pairs whose same-subject label is "YES"
with(data_v2, sum(is.na(same_subject))) ## 0
with(data_v2, sum(is.na(same_claim[same_subject == "YES"])))
## so we don't do any re-running here 


#saveRDS(data_v2, "data/gpt_annotations_full_strict_rit.rds")

## Load the saved copy; this replaces the object built above
data_v2 <- readRDS("data/gpt_annotations_full_strict_rit.rds")

#############################################
### Read in Data: Fine Tuning ################
#############################################

## gpt4 fine tune training ids
## (i.e. pairs used in fine tuning)
fine_tune_train_ids <- readRDS("data/train_fine_tune_ids.rds")


## List the fine tuned annotation files and strip the "annotate_" prefix
## and ".rds" suffix to recover the file number
annotations_finetune <- list.files("data/gpt4o_finetune_annotations/")
annotations_num <- gsub("annotate_|\\.rds", "", annotations_finetune)

## checking if all read in: number of indices in 1-100 with no matching file
sum(1:100 %ni% as.numeric(annotations_num)) ## 0

annotations_finetune <- paste0("data/gpt4o_finetune_annotations/",
                          annotations_finetune)

## Read each file into a preallocated list (in the order returned by
## list.files()) and stack the pieces into one data frame
data_finetune <- vector("list", length(annotations_finetune))
for (i in seq_along(annotations_finetune)) {
  data_finetune[[i]] <- readRDS(annotations_finetune[i])
}
data_finetune <- bind_rows(data_finetune)

## Distribution of same-subject labels and number of missing ones
with(data_finetune, table(gpt4o_fine_tune_same_subject))
with(data_finetune, sum(is.na(gpt4o_fine_tune_same_subject)))
## 17 no label - need to be re-run

## Note: we identified three additional missing values (counted below)
## while creating the replication code, after all the data labeling was
## finished. As such we exclude these pairs. Given the small number,
## excluding these three pairs is extremely unlikely to have any
## substantial impact on the findings. Most likely they were negative
## cases.
## Count: missing same-claim labels among pairs whose same-subject label
## is a non-missing "YES" (%in% is FALSE for NA)
with(
  data_finetune,
  sum(is.na(gpt4o_fine_tune_same_claim[
    gpt4o_fine_tune_same_subject %in% "YES"
  ]))
)
## 3

#######################################
### Rerun Same Subject Fine Tune Labels
######################################

## Re-running 17 pairs where 
## same subject annotator errored out

## Note to replicators:
## you don't need to run this code because
## we have saved a derivative product below.

## Row indices of data_finetune whose same-subject label is missing.
## This reuses (and overwrites) the name `torun` from the zero shot section.
torun <- which(is.na(data_finetune$gpt4o_fine_tune_same_subject))

## Instruction text that api_fct_subject() sends as the system message.
## The line break inside the literal is part of the string.
prompt_compare_subject <- "You will be provided with the lists of the people, places, objects, and events discussed in two paragraphs. Based on these
lists, do the two paragraphs discuss the vast majority of the same people, places, objects, and events?"

## Ask the fine tuned model for a same-subject label.
##
## Arguments:
##   x          - text sent as the user message
##   sleep_secs - seconds to pause after a failed API call (default 0.1)
##
## Sends `prompt_compare_subject` (read from the calling environment) as the
## system message and `x` as the user message, with temperature 0.
##
## Returns the reply text (`m$choices$message.content`), or NA if the API
## call raises an error.
api_fct_subject <- function(x, sleep_secs = 0.1) {
  tryCatch(
    expr = {
      m <- create_chat_completion(model = "ft:gpt-4o-2024-08-06:personal::Acor6lGL",
                                  messages = list(list(role = "system",
                                                       content = prompt_compare_subject),
                                                  list(role = "user",
                                                       content = x)),
                                  temperature = 0)
      m$choices$message.content
    },
    error = function(e) {
      message('Caught an error!')
      ## report what went wrong instead of hiding it
      message(conditionMessage(e))
      ## The handler used to read an undefined `sl`, which made the handler
      ## itself fail; pause for the explicit argument and return NA instead
      Sys.sleep(sleep_secs)
      NA
    }
  )
}


## Re-run the same-subject annotator for every row index in `torun` and
## collect the replies
rerun_annotation <- list()
## seq_along() makes this a no-op when `torun` is empty; 1:length(torun)
## would iterate over c(1, 0) and call the API with prompts built from NA
for (i in seq_along(torun)) {
  ## user message: the two subject lists followed by the label request
  prompt_s <- paste0("\n **Paragraph 1**: ", data_finetune$ego_subject[torun[i]],
                     "\n **Paragraph 2**: ", data_finetune$alter_subject[torun[i]],
                     "\n", "**Your label (Respond only with 'YES' or 'NO')**:")
  rerun_annotation[[i]] <- api_fct_subject(prompt_s)
  print(i) ## progress
}
unlist(rerun_annotation) ## all NOs
## so we give a final label of No 
## for whether same claim, same subject


## Final fine tuned label: "YES" only when both the same-subject and the
## same-claim labels are "YES". %in% is FALSE for NA, so a pair with a
## missing same-subject or same-claim label is labelled "NO" rather than
## NA (the re-run same-subject labels are not written back into
## data_finetune, so those rows still have a missing same-subject label).
data_finetune$gpt4o_finetune <- ifelse(
  data_finetune$gpt4o_fine_tune_same_subject %in% "YES" &
    data_finetune$gpt4o_fine_tune_same_claim %in% "YES",
  "YES", "NO"
)

table(data_finetune$gpt4o_finetune) ## 4,204 predicted positive

#saveRDS(data_finetune, "data/gpt_annotations_finetune_full.rds")

## Load the saved copy; this replaces the object built above
data_finetune <- readRDS("data/gpt_annotations_finetune_full.rds")




########################################################
### Creating Precision Coding Set: Zero Shot ###########
########################################################

## In the final sections of this code file
## we take random draws from predicted 
## positive cases from each of our three LLM annotators.
## These were then labelled by human coders in a precision
## test. 

## Flag pairs whose ego date falls within one day either side of the
## alter date (a three day window). For the zero shot annotator we
## compare precision by this flag downstream.
data$date_window_3 <- local({
  ego_day <- as.Date(data$ego_date)
  alter_day <- as.Date(data$alter_date)
  ego_day <= alter_day + days(1) & ego_day >= alter_day - days(1)
})

## keep only the predicted positive pairs
tolabel <- filter(data, gpt_annotation == "YES")

## draw 100 of them at random, without replacement
set.seed(08540)
samp <- sample.int(nrow(tolabel), size = 100, replace = FALSE)

tolabel <- tolabel[samp, ]
table(tolabel$date_window_3) ## 63 in date window 

## split into two coding files of 50 pairs each
tolabel$RA <- rep(c("Set1", "Set2"), each = 50)

## empty columns for the human coders to fill in
tolabel$same_subject <- NA
tolabel$same_claim <- NA

#write.csv(tolabel[tolabel$RA == "Set1", c("ego_id",
 #                            "alter_id",
#                             "ego_summary",
#                           "alter_summary",
#                            "same_subject",
 #                          "same_claim")],
#         "data/precision_gpt4o_update_test_ra1.csv")

#write.csv(tolabel[tolabel$RA == "Set2", c("ego_id",
#                                          "alter_id",
#                                          "ego_summary",
#                                          "alter_summary",
#                                          "same_subject",
#                                          "same_claim")],
#          "data/precision_gpt4o_update_test_ra2.csv")

#saveRDS(tolabel, "data/precision_gpt4o_update_test_master.rds")

##############################################################
### Creating Precision Coding Set: Strict Annotator ###########
##############################################################

## Sample precision set for 
## Strict annotator (Zero shot)

## Flag pairs whose ego date falls within one day either side of the
## alter date (a three day window)
data_v2$date_window_3 <- local({
  ego_day <- as.Date(data_v2$ego_date)
  alter_day <- as.Date(data_v2$alter_date)
  ego_day <= alter_day + days(1) & ego_day >= alter_day - days(1)
})

## keep only the predicted positive pairs
tolabel <- filter(data_v2, gpt_annotation_strict == "YES")

## draw 100 of them at random, without replacement
set.seed(08540)
samp <- sample.int(nrow(tolabel), size = 100, replace = FALSE)

tolabel <- tolabel[samp, ]
table(tolabel$date_window_3) ## 71 in date window 

## split into two coding files of 50 pairs each
tolabel$RA <- rep(c("Set1", "Set2"), each = 50)

## blank out the label columns so the human coders can fill them in
tolabel$same_subject <- NA
tolabel$same_claim <- NA

#write.csv(tolabel[tolabel$RA == "Set1", c("ego_id",
#                             "alter_id",
#                             "ego_summary",
#                           "alter_summary",
#                            "same_subject",
#                           "same_claim")],
#         "data/precision_gpt4o_strict_test_ra1.csv")

#write.csv(tolabel[tolabel$RA == "Set2", c("ego_id",
#                                          "alter_id",
#                                          "ego_summary",
#                                          "alter_summary",
#                                          "same_subject",
 #                                         "same_claim")],
#         "data/precision_gpt4o_strict_test_ra2.csv")

#saveRDS(tolabel, "data/precision_gpt4o_strict_rit_test_master.rds")

##############################################################
### Creating Precision Coding Set: Fine Tuning  ##############
##############################################################

## Same process for labels by fine tuned annotator

## We remove from labeling any pair whose focal (ego) or alter article
## was included in the training set.

## Note: When we were creating this dataset we made a slight error:
## we only removed focal or alter ids that were used in training
## the same claim annotator, not the same subject annotator
## (an additional 100 article IDs). We remove these latter ones in the
## precision analysis. We didn't edit this code because these are
## pairs labelled by our human RAs.

'%ni%' <- Negate("%in%")

## keep predicted positive pairs with neither article in the training ids
tolabel <- filter(
  data_finetune,
  gpt4o_finetune == "YES",
  ego_id %ni% fine_tune_train_ids,
  alter_id %ni% fine_tune_train_ids
)

## draw 100 of them at random, without replacement
set.seed(08540)
samp <- sample.int(nrow(tolabel), size = 100, replace = FALSE)

tolabel <- tolabel[samp, ]

## split into three coding files (35, 35 and 30 pairs)
tolabel$RA <- rep(c("Set1", "Set2", "Set3"), times = c(35, 35, 30))

## empty columns for the human coders to fill in
tolabel$same_subject <- NA
tolabel$same_claim <- NA

#write.csv(tolabel[tolabel$RA == "Set1", c("ego_id",
#                             "alter_id",
#                             "ego_summary",
#                           "alter_summary",
#                            "same_subject",
#                           "same_claim")],
#         "data/precision_gpt4o_finetune_ra1.csv")

#write.csv(tolabel[tolabel$RA == "Set2", c("ego_id",
#                                          "alter_id",
#                                          "ego_summary",
#                                          "alter_summary",
#                                          "same_subject",
#                                          "same_claim")],
 #        "data/precision_gpt4o_finetune_ra2.csv")

#write.csv(tolabel[tolabel$RA == "Set3", c("ego_id",
#                                          "alter_id",
#                                          "ego_summary",
#                                          "alter_summary",
#                                          "same_subject",
#                                          "same_claim")],
#         "data/precision_gpt4o_finetune_ra3.csv")

#saveRDS(tolabel, "data/precision_gpt4o_finetune_master.rds")
